%%time
# Data Wranglers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Order 66
%config Completer.use_jedi = False
# Convinience
from glob import glob
from warnings import filterwarnings
from tqdm.notebook import tqdm
filterwarnings('ignore')
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
Wall time: 3.56 s
def get_housing_data(*, verbose: bool=True) -> "sample, test, train":
"""
Reads original CSV files from Directory structure.
"""
path = r"..\Data\*"
file_path = [ _ for _ in glob(path) ]
return [ pd.read_csv(_, verbose=verbose) for _ in file_path ]
def cat_analysis(var_name_1: str, var_name_2: str="SalePrice",* , Violin: bool=False)->None:
var1, var2 = train.loc[:,var_name_1].astype("category"), train.loc[:,var_name_2]
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(18,8))
sns.countplot(y=var1, ax=ax1)
if Violin:
sns.violinplot(y=var1, x=var2, ax=ax2)
else:
sns.boxplot(y=var1, x=var2, ax=ax2)
ax1.grid(True); ax2.grid(True)
ax1.set_title("Count Plot"); ax2.set_title("Box Plot")
plt.show()
def nan_count(var_name: str)->None:
nan_counts = train.loc[:, var_name].isna().sum()
print("{}: Number of NaN values = {:,}".format(var_name, nan_counts))
from scipy.stats import skew, kurtosis
def mean_reduction(var_name_1:str, var_name_2:str="SalePrice")->pd.DataFrame:
tabel = train.loc[:, [var_name_1, var_name_2]]
tabel_gb = tabel.groupby(var_name_1)[var_name_2].agg([np.mean, np.std, skew, kurtosis])
cat_mean, cat_std = tabel_gb.loc[:, "mean"].mean(), tabel_gb.loc[:, "std"].mean()
full_mean, full_std = tabel.loc[:, var_name_2].mean(), tabel.loc[:, var_name_2].std()
print("------MEAN-------")
print("Total Mean = {:0.4f}".format(full_mean))
print("Categ Mean = {:0.4f}".format(cat_mean))
print("Mean Reduction = {:0.4f} %".format((1-(cat_mean / full_mean)) * 100))
print("------STD--------")
print("Total Mean = {:0.4f}".format(full_std))
print("Categ Mean = {:0.4f}".format(cat_std))
print("Mean Reduction = {:0.4f} %".format((1-(cat_std / full_std)) * 100))
return tabel_gb
from scipy.stats import probplot
def num_analysis(*var_names:str)-> None:
if len(var_names) == 1:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(18,8))
sns.histplot(x=train.loc[:, var_names[0]], ax=ax1,kde=True)
probplot(x=train.loc[:, var_names[0]], plot=ax2, rvalue=True)
ax1.set_title("HistoGram")
ax1.grid(True); ax2.grid(True)
plt.show()
elif len(var_names) == 2:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(18,8))
sns.histplot(x=train.loc[:, var_names[0]], ax=ax1,kde=True)
sns.regplot(x=train.loc[:,var_names[0]],y=train.loc[:,var_names[1]],ax=ax2)
ax1.set_title("HistoGram: {}".format(var_names[0]))
ax2.set_title("Scatter Plot")
ax1.grid(True); ax2.grid(True)
plt.show()
from scipy.stats import chi2_contingency
def chi_test(var_name_1:str, var_name_2:str)->pd.DataFrame:
Cross_Tab = pd.crosstab(train.loc[:,var_name_1], train.loc[:,var_name_2])
chi_sq, p_val, *_ = chi2_contingency(Cross_Tab)
print("Chi Sq. Value = {}".format(chi_sq))
print("P Vlaue of Hypo = {}".format(p_val))
return Cross_Tab
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error
def tree_fit(var_name:str, *, criterion:str='mse', LOG:bool=False, PLOT:bool=False, PRINT:bool=False):
dt = DecisionTreeRegressor(criterion=criterion)
X = train.loc[:, var_name].values.reshape(-1, 1)
if LOG:
y = train.loc[:, "log_SalePrice"].values
else:
y = train.loc[:, "SalePrice"].values
dt.fit(X, y)
if PLOT:
fig, ax = plt.subplots(1,1, figsize=(18,12))
plot_tree(dt, ax=ax, filled=True, fontsize=10)
if PRINT:
print("Depth = {}".format(dt.get_depth()))
print("Leaves = {}".format(dt.get_n_leaves()))
y_pred = dt.predict(X)
return mean_squared_error(y, y_pred)
from sklearn.linear_model import LinearRegression
def line_fit(var_name:str, *, LOG:bool=True):
lr = LinearRegression(normalize=True)
X = train.loc[:, var_name].values.reshape(-1, 1)
if LOG:
y = train.loc[:, "log_SalePrice"].values
else:
y = train.loc[:, "SalePrice"].values
lr.fit(X, y)
y_pred = lr.predict(X)
return mean_squared_error(y, y_pred)
sample_sub, test, train = get_housing_data()
Tokenization took: 1.03 ms Type conversion took: 0.00 ms Parser memory cleanup took: 0.00 ms Tokenization took: 15.74 ms Type conversion took: 14.69 ms Parser memory cleanup took: 0.00 ms Tokenization took: 12.25 ms Type conversion took: 18.31 ms Parser memory cleanup took: 0.00 ms
train
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | Ex | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | Ex | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | Ex | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | Gd | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | Ex | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 1456 | 60 | RL | 62.0 | 7917 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 5 | 1999 | 2000 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | Unf | 0 | Unf | 0 | 953 | 953 | GasA | Ex | Y | SBrkr | 953 | 694 | 0 | 1647 | 0 | 0 | 2 | 1 | 3 | 1 | TA | 7 | Typ | 1 | TA | Attchd | 1999.0 | RFn | 2 | 460 | TA | TA | Y | 0 | 40 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 8 | 2007 | WD | Normal | 175000 |
| 1456 | 1457 | 20 | RL | 85.0 | 13175 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1978 | 1988 | Gable | CompShg | Plywood | Plywood | Stone | 119.0 | TA | TA | CBlock | Gd | TA | No | ALQ | 790 | Rec | 163 | 589 | 1542 | GasA | TA | Y | SBrkr | 2073 | 0 | 0 | 2073 | 1 | 0 | 2 | 0 | 3 | 1 | TA | 7 | Min1 | 2 | TA | Attchd | 1978.0 | Unf | 2 | 500 | TA | TA | Y | 349 | 0 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 2 | 2010 | WD | Normal | 210000 |
| 1457 | 1458 | 70 | RL | 66.0 | 9042 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 9 | 1941 | 2006 | Gable | CompShg | CemntBd | CmentBd | None | 0.0 | Ex | Gd | Stone | TA | Gd | No | GLQ | 275 | Unf | 0 | 877 | 1152 | GasA | Ex | Y | SBrkr | 1188 | 1152 | 0 | 2340 | 0 | 0 | 2 | 0 | 4 | 1 | Gd | 9 | Typ | 2 | Gd | Attchd | 1941.0 | RFn | 1 | 252 | TA | TA | Y | 0 | 60 | 0 | 0 | 0 | 0 | NaN | GdPrv | Shed | 2500 | 5 | 2010 | WD | Normal | 266500 |
| 1458 | 1459 | 20 | RL | 68.0 | 9717 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1950 | 1996 | Hip | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | TA | TA | Mn | GLQ | 49 | Rec | 1029 | 0 | 1078 | GasA | Gd | Y | FuseA | 1078 | 0 | 0 | 1078 | 1 | 0 | 1 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 1950.0 | Unf | 1 | 240 | TA | TA | Y | 366 | 0 | 112 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 142125 |
| 1459 | 1460 | 20 | RL | 75.0 | 9937 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | Norm | 1Fam | 1Story | 5 | 6 | 1965 | 1965 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | Gd | TA | CBlock | TA | TA | No | BLQ | 830 | LwQ | 290 | 136 | 1256 | GasA | Gd | Y | SBrkr | 1256 | 0 | 0 | 1256 | 1 | 0 | 1 | 1 | 3 | 1 | TA | 6 | Typ | 0 | NaN | Attchd | 1965.0 | Fin | 1 | 276 | TA | TA | Y | 736 | 68 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2008 | WD | Normal | 147500 |
1460 rows × 81 columns
train.select_dtypes("object")
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinType2 | Heating | HeatingQC | CentralAir | Electrical | KitchenQual | Functional | FireplaceQu | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | BrkFace | Gd | TA | PConc | Gd | TA | No | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | NaN | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1 | RL | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | Gable | CompShg | MetalSd | MetalSd | None | TA | TA | CBlock | Gd | TA | Gd | ALQ | Unf | GasA | Ex | Y | SBrkr | TA | Typ | TA | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 2 | RL | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | BrkFace | Gd | TA | PConc | Gd | TA | Mn | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | TA | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 3 | RL | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | Gable | CompShg | Wd Sdng | Wd Shng | None | TA | TA | BrkTil | TA | Gd | No | ALQ | Unf | GasA | Gd | Y | SBrkr | Gd | Typ | Gd | Detchd | Unf | TA | TA | Y | NaN | NaN | NaN | WD | Abnorml |
| 4 | RL | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | BrkFace | Gd | TA | PConc | Gd | TA | Av | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | TA | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | Gable | CompShg | VinylSd | VinylSd | None | TA | TA | PConc | Gd | TA | No | Unf | Unf | GasA | Ex | Y | SBrkr | TA | Typ | TA | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1456 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | Norm | 1Fam | 1Story | Gable | CompShg | Plywood | Plywood | Stone | TA | TA | CBlock | Gd | TA | No | ALQ | Rec | GasA | TA | Y | SBrkr | TA | Min1 | TA | Attchd | Unf | TA | TA | Y | NaN | MnPrv | NaN | WD | Normal |
| 1457 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | Gable | CompShg | CemntBd | CmentBd | None | Ex | Gd | Stone | TA | Gd | No | GLQ | Unf | GasA | Ex | Y | SBrkr | Gd | Typ | Gd | Attchd | RFn | TA | TA | Y | NaN | GdPrv | Shed | WD | Normal |
| 1458 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | Hip | CompShg | MetalSd | MetalSd | None | TA | TA | CBlock | TA | TA | Mn | GLQ | Rec | GasA | Gd | Y | FuseA | Gd | Typ | NaN | Attchd | Unf | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1459 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | Norm | 1Fam | 1Story | Gable | CompShg | HdBoard | HdBoard | None | Gd | TA | CBlock | TA | TA | No | BLQ | LwQ | GasA | Gd | Y | SBrkr | TA | Typ | NaN | Attchd | Fin | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
1460 rows × 43 columns
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchenAbvGr 1460 non-null int64 53 KitchenQual 1460 non-null object 54 TotRmsAbvGrd 1460 non-null int64 55 Functional 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
# nan_data
train.isna().sum().loc[ train.isna().sum() > 0 ]
LotFrontage 259 Alley 1369 MasVnrType 8 MasVnrArea 8 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Electrical 1 FireplaceQu 690 GarageType 81 GarageYrBlt 81 GarageFinish 81 GarageQual 81 GarageCond 81 PoolQC 1453 Fence 1179 MiscFeature 1406 dtype: int64
%%time
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(18,9))
sns.barplot(x=train.isna().sum().loc[ train.isna().sum() > 0 ], y=train.isna().sum().loc[ train.isna().sum() > 0 ].index,ax=ax1)
sns.barplot(x=train.isna().sum().loc[ train.isna().sum()>0 ]*100/train.shape[0], y=train.isna().sum().loc[ train.isna().sum()>0 ].index,ax=ax2)
ax1.grid(True); ax2.grid(True)
ax1.set_title("NaN Counts"); ax2.set_title("NaN Percent")
plt.show()
Wall time: 1.96 s
%%time
fig, ax = plt.subplots(1,1, figsize=(21,10))
sns.heatmap(train.isna().loc[ :, train.isna().sum() > 0 ], yticklabels=False, ax=ax)
plt.show()
Wall time: 1.06 s
num_analysis("SalePrice")
train["log_SalePrice"] = np.log(train["SalePrice"])
num_analysis("log_SalePrice")
[ _ for _ in train.columns if ("SF" in _ or "Area" in _ )]
['LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'PoolArea']
(train.GrLivArea - train.loc[:, "1stFlrSF"] - train.loc[:, "2ndFlrSF"]).describe()
count 1460.000000 mean 5.844521 std 48.623081 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 572.000000 dtype: float64
(train.GrLivArea - train.loc[:, "1stFlrSF"] - train.loc[:, "2ndFlrSF"]).loc[(train.GrLivArea - train.loc[:, "1stFlrSF"] - train.loc[:, "2ndFlrSF"])!=0]
51 360 88 513 125 234 170 528 185 572 187 144 197 392 198 371 263 390 267 420 406 473 589 156 635 515 729 360 829 80 831 80 868 53 873 232 883 481 945 120 1009 514 1031 397 1173 479 1349 205 1364 80 1440 384 dtype: int64
# print([_ for _ in train.columns if "SF" in _ and _ is not "Total_SF" ])
SF_List = ['TotalBsmtSF','GrLivArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','GarageArea']
(train.LowQualFinSF - (train.GrLivArea - train.loc[:, "1stFlrSF"] - train.loc[:, "2ndFlrSF"])).describe()
count 1460.0 mean 0.0 std 0.0 min 0.0 25% 0.0 50% 0.0 75% 0.0 max 0.0 dtype: float64
train["Total_SF"] = train.loc[:, SF_List].sum(axis=1)
num_analysis("Total_SF")
num_analysis("Total_SF", "SalePrice")
from scipy.stats import pearsonr
pearsonr(train.SalePrice, train.Total_SF)
(0.8100877726165194, 0.0)
train.loc[:, SF_List].corr() * 100
| TotalBsmtSF | GrLivArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | GarageArea | |
|---|---|---|---|---|---|---|---|---|
| TotalBsmtSF | 100.000000 | 45.486820 | 23.201861 | 24.726375 | -9.547774 | 3.738373 | 8.448899 | 48.666546 |
| GrLivArea | 45.486820 | 100.000000 | 24.743282 | 33.022396 | 0.911321 | 2.064319 | 10.151040 | 46.899748 |
| WoodDeckSF | 23.201861 | 24.743282 | 100.000000 | 5.866061 | -12.598889 | -3.277063 | -7.418135 | 22.466631 |
| OpenPorchSF | 24.726375 | 33.022396 | 5.866061 | 100.000000 | -9.307932 | -0.584250 | 7.430394 | 24.143467 |
| EnclosedPorch | -9.547774 | 0.911321 | -12.598889 | -9.307932 | 100.000000 | -3.730528 | -8.286424 | -12.177672 |
| 3SsnPorch | 3.738373 | 2.064319 | -3.277063 | -0.584250 | -3.730528 | 100.000000 | -3.143585 | 3.508670 |
| ScreenPorch | 8.448899 | 10.151040 | -7.418135 | 7.430394 | -8.286424 | -3.143585 | 100.000000 | 5.141176 |
| GarageArea | 48.666546 | 46.899748 | 22.466631 | 24.143467 | -12.177672 | 3.508670 | 5.141176 | 100.000000 |
fig, ax = plt.subplots(1,1, figsize=(15, 15))
sns.heatmap(train.loc[:, SF_List].corr()*100, square=True, ax=ax, fmt="0.2f", annot=True)
plt.show()
train.loc[:, [_ for _ in train.columns if "SF" in _ ]].corr().loc[:, "Total_SF"].sort_values(ascending=False) * 100
Total_SF 100.000000 TotalBsmtSF 80.148655 1stFlrSF 78.269775 BsmtFinSF1 41.958503 OpenPorchSF 39.290031 WoodDeckSF 38.394627 BsmtUnfSF 34.424422 2ndFlrSF 33.423223 BsmtFinSF2 5.042387 LowQualFinSF 4.346126 Name: Total_SF, dtype: float64
for _ in tqdm([_ for _ in train.columns if "SF" in _ and "Total_SF" not in _ ]):
sns.jointplot(x=train.loc[:, "Total_SF"], y=train.loc[:, _]); plt.show()
train["Age_At_Sale"] = train.loc[:, "YrSold"] - train.loc[:, "YearBuilt"]
num_analysis("Age_At_Sale")
num_analysis("Age_At_Sale", "SalePrice")
[ _ for _ in train.columns if "Y" in _ or "Age" in _ ]
['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'Age_At_Sale']
for _ in tqdm([ _ for _ in train.columns if "Y" in _ or "Age" in _ ]):
sns.jointplot(x=train.loc[:, _], y=train.loc[:, "Age_At_Sale"])
plt.grid(); plt.show()
train["Remodel_Age"] = train.YrSold - train.YearRemodAdd
train["Remodel_Done"] = (train.YearRemodAdd - train.YearBuilt).apply(lambda x: 1 if x>0 else 0)
train.loc[:, [ _ for _ in train.columns if "Bath" in _ ]]
| BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | |
|---|---|---|---|---|
| 0 | 1 | 0 | 2 | 1 |
| 1 | 0 | 1 | 2 | 0 |
| 2 | 1 | 0 | 2 | 1 |
| 3 | 1 | 0 | 1 | 0 |
| 4 | 1 | 0 | 2 | 1 |
| ... | ... | ... | ... | ... |
| 1455 | 0 | 0 | 2 | 1 |
| 1456 | 1 | 0 | 2 | 0 |
| 1457 | 0 | 0 | 2 | 0 |
| 1458 | 1 | 0 | 1 | 0 |
| 1459 | 1 | 0 | 1 | 1 |
1460 rows × 4 columns
train["Half_bath"] = train.loc[:, [ _ for _ in train.columns if "Bath" in _ and "Half" in _ ]].sum(axis=1)
train["Full_bath"] = train.loc[:, [ _ for _ in train.columns if "Bath" in _ and "Half" not in _ ]].sum(axis=1)
train["All_bath"] = train["Full_bath"] + train["Half_bath"]
train.loc[:,["Full_bath", "Half_bath"]].loc[train.loc[:, "Full_bath"] == 0]
| Full_bath | Half_bath | |
|---|---|---|
| 597 | 0 | 4 |
train.MSSubClass.value_counts()
20 536 60 299 50 144 120 87 30 69 160 63 70 60 80 58 90 52 190 30 85 20 75 16 45 12 180 10 40 4 Name: MSSubClass, dtype: int64
cat_analysis("MSSubClass")
_ = mean_reduction("MSSubClass")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 154044.0011 Mean Reduction = 14.8557 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 49546.4960 Mean Reduction = 37.6323 %
MSS_dict = {}
for obj, key in enumerate(list(_.sort_values(by="mean").index)):
MSS_dict[key] = obj
train.MSSubClass = train.MSSubClass.map(MSS_dict)
cat_analysis("MSSubClass")
cat_analysis("MSZoning")
_ = mean_reduction("MSZoning")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 147484.4523 Mean Reduction = 18.4814 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 50232.5807 Mean Reduction = 36.7686 %
MSZoning_dict = {}
for obj, key in enumerate(list(_.sort_values("mean").index)):
MSZoning_dict[key] = obj
train.MSZoning = train.MSZoning.map(MSZoning_dict)
cat_analysis("MSZoning")
nan_count("LotFrontage")
LotFrontage: Number of NaN values = 259
train.corr().loc["LotFrontage"].sort_values(ascending=False) * 100
LotFrontage 100.000000 Total_SF 46.515483 1stFlrSF 45.718100 LotArea 42.609502 GrLivArea 40.279741 TotalBsmtSF 39.207458 log_SalePrice 35.587847 TotRmsAbvGrd 35.209595 SalePrice 35.179910 GarageArea 34.499672 MSSubClass 30.581128 GarageCars 28.569092 MSZoning 27.143053 Fireplaces 26.663948 BedroomAbvGr 26.316992 OverallQual 25.164578 BsmtFinSF1 23.363317 Full_bath 21.830142 PoolArea 20.616678 All_bath 20.180394 FullBath 19.876868 MasVnrArea 19.345806 OpenPorchSF 15.197223 BsmtUnfSF 13.264374 YearBuilt 12.334947 BsmtFullBath 10.094857 YearRemodAdd 8.886557 WoodDeckSF 8.852093 2ndFlrSF 8.017727 GarageYrBlt 7.024978 3SsnPorch 7.002923 HalfBath 5.353185 BsmtFinSF2 4.989968 Half_bath 4.543870 ScreenPorch 4.138279 LowQualFinSF 3.846853 MoSold 1.119995 EnclosedPorch 1.070034 YrSold 0.744959 MiscVal 0.336756 KitchenAbvGr -0.606883 BsmtHalfBath -0.723430 Id -1.060069 Remodel_Done -4.025484 OverallCond -5.921345 Remodel_Age -8.841731 Age_At_Sale -12.284789 Name: LotFrontage, dtype: float64
nan_count("LotArea")
LotArea: Number of NaN values = 0
num_analysis("LotArea")
num_analysis("LotArea", "SalePrice")
train.Street.value_counts(dropna=False)
Pave 1454 Grvl 6 Name: Street, dtype: int64
train.Alley.value_counts(dropna=False)
NaN 1369 Grvl 50 Pave 41 Name: Alley, dtype: int64
(train.Street + train.Alley.fillna("")).value_counts()
Pave 1363 PaveGrvl 50 PavePave 41 Grvl 6 dtype: int64
train["Road"] = train.Street + train.Alley.fillna("")
cat_analysis("Road")
road_dict = {}
for obj, key in enumerate(list(mean_reduction("Road").sort_values("mean").index)):
road_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 151024.1894 Mean Reduction = 16.5249 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 54816.7433 Mean Reduction = 30.9982 %
train.Road = train.Road.map(road_dict)
nan_count("LotShape")
LotShape: Number of NaN values = 0
train.LotShape.value_counts()
Reg 925 IR1 484 IR2 41 IR3 10 Name: LotShape, dtype: int64
cat_analysis("LotShape")
LotShape_dict = {}
for obj, key in enumerate(mean_reduction("LotShape").sort_values("mean").index):
LotShape_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 206681.5874 Mean Reduction = -14.2385 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 84435.4197 Mean Reduction = -6.2849 %
train.LotShape = train.LotShape.map(LotShape_dict)
nan_count("LandContour")
LandContour: Number of NaN values = 0
train.LandContour.value_counts()
Lvl 1311 Bnk 63 HLS 50 Low 36 Name: LandContour, dtype: int64
cat_analysis("LandContour")
LandContour_dict = {}
for obj, key in enumerate(mean_reduction("LandContour").sort_values("mean").index):
LandContour_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 189620.7193 Mean Reduction = -4.8085 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 78387.5763 Mean Reduction = 1.3279 %
train.LandContour = train.LandContour.map(LandContour_dict)
nan_count("Utilities")
Utilities: Number of NaN values = 0
train.Utilities.value_counts()
AllPub 1459 NoSeWa 1 Name: Utilities, dtype: int64
cat_analysis("Utilities")
mean_reduction("Utilities")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 159225.4784 Mean Reduction = 11.9918 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 79461.5998 Mean Reduction = -0.0240 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| Utilities | ||||
| AllPub | 180950.95682 | 79461.599814 | 1.879863 | 6.504315 |
| NoSeWa | 137500.00000 | NaN | 0.000000 | -3.000000 |
nan_count("LotConfig")
LotConfig: Number of NaN values = 0
train.LotConfig.value_counts()
Inside 1052 Corner 263 CulDSac 94 FR2 47 FR3 4 Name: LotConfig, dtype: int64
cat_analysis("LotConfig")
LotConfig_dict = {}
for obj, key in enumerate(mean_reduction("LotConfig").sort_values("mean").index):
LotConfig_dict[key] = obj
train.LotConfig = train.LotConfig.map(LotConfig_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 193765.1330 Mean Reduction = -7.0992 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 79035.7086 Mean Reduction = 0.5121 %
cat_analysis("LotConfig")
nan_count("LandSlope")
LandSlope: Number of NaN values = 0
train.LandSlope.value_counts()
Gtl 1382 Mod 65 Sev 13 Name: LandSlope, dtype: int64
train.LandSlope.value_counts(normalize=True).round(4) * 100
Gtl 94.66 Mod 4.45 Sev 0.89 Name: LandSlope, dtype: float64
cat_analysis("LandSlope")
mean_reduction("LandSlope")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 193690.0563 Mean Reduction = -7.0577 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 85924.8832 Mean Reduction = -8.1598 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| LandSlope | ||||
| Gtl | 179956.799566 | 78669.369151 | 1.959097 | 7.038784 |
| Mod | 196734.138462 | 92375.358116 | 0.972794 | 1.521768 |
| Sev | 204379.230769 | 86729.922409 | 0.267832 | -0.563832 |
nan_count("Neighborhood")
Neighborhood: Number of NaN values = 0
train.Neighborhood.value_counts()
NAmes 225 CollgCr 150 OldTown 113 Edwards 100 Somerst 86 Gilbert 79 NridgHt 77 Sawyer 74 NWAmes 73 SawyerW 59 BrkSide 58 Crawfor 51 Mitchel 49 NoRidge 41 Timber 38 IDOTRR 37 ClearCr 28 SWISU 25 StoneBr 25 Blmngtn 17 MeadowV 17 BrDale 16 Veenker 11 NPkVill 9 Blueste 2 Name: Neighborhood, dtype: int64
train.Neighborhood.value_counts(normalize=True).round(4) * 100
NAmes 15.41 CollgCr 10.27 OldTown 7.74 Edwards 6.85 Somerst 5.89 Gilbert 5.41 NridgHt 5.27 Sawyer 5.07 NWAmes 5.00 SawyerW 4.04 BrkSide 3.97 Crawfor 3.49 Mitchel 3.36 NoRidge 2.81 Timber 2.60 IDOTRR 2.53 ClearCr 1.92 SWISU 1.71 StoneBr 1.71 Blmngtn 1.16 MeadowV 1.16 BrDale 1.10 Veenker 0.75 NPkVill 0.62 Blueste 0.14 Name: Neighborhood, dtype: float64
cat_analysis("Neighborhood")
neigh_dict = {}
for obj, key in enumerate(mean_reduction("Neighborhood").sort_values("mean").index):
neigh_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 183964.8533 Mean Reduction = -1.6823 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 48571.1308 Mean Reduction = 38.8600 %
train.Neighborhood = train.Neighborhood.map(neigh_dict)
nan_count("Condition1")
Condition1: Number of NaN values = 0
train.Condition1.value_counts()
Norm 1260 Feedr 81 Artery 48 RRAn 26 PosN 19 RRAe 11 PosA 8 RRNn 5 RRNe 2 Name: Condition1, dtype: int64
cat_analysis("Condition1")
cond1_dict = {}
for obj, key in enumerate(mean_reduction("Condition1").sort_values("mean").index):
cond1_dict[key] = obj
train.Condition1 = train.Condition1.map(cond1_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 181007.6073 Mean Reduction = -0.0478 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 55175.7416 Mean Reduction = 30.5463 %
nan_count("Condition2")
Condition2: Number of NaN values = 0
train.Condition2.value_counts()
Norm 1445 Feedr 6 Artery 2 PosN 2 RRNn 2 RRAn 1 RRAe 1 PosA 1 Name: Condition2, dtype: int64
cat_analysis("Condition2")
cond2_dict = {}
for obj, key in enumerate(mean_reduction("Condition2").sort_values("mean").index):
cond2_dict[key] = obj
train.Condition2 = train.Condition2.map(cond2_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 180295.7590 Mean Reduction = 0.3457 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 62139.1704 Mean Reduction = 21.7810 %
nan_count("BldgType")
BldgType: Number of NaN values = 0
train.BldgType.value_counts()
1Fam 1220 TwnhsE 114 Duplex 52 Twnhs 43 2fmCon 31 Name: BldgType, dtype: int64
cat_analysis("BldgType")
BldgType_dict = {}
for obj, key in enumerate(mean_reduction("BldgType").sort_values("mean").index):
BldgType_dict[key] = obj
train.BldgType = train.BldgType.map(BldgType_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 153121.6225 Mean Reduction = 15.3656 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 49515.9257 Mean Reduction = 37.6707 %
nan_count("HouseStyle")
HouseStyle: Number of NaN values = 0
cat_analysis("HouseStyle")
HouseStyle_dict = {}
for obj, key in enumerate(mean_reduction("HouseStyle").sort_values("mean").index):
HouseStyle_dict[key] = obj
train.HouseStyle = train.HouseStyle.map(HouseStyle_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 164804.5499 Mean Reduction = 8.9081 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 61080.1884 Mean Reduction = 23.1140 %
cat_analysis("OverallQual")
mean_reduction("OverallQual")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 188149.4492 Mean Reduction = -3.9952 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 49593.5331 Mean Reduction = 37.5730 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| OverallQual | ||||
| 1 | 50150.000000 | 15344.217152 | 0.000000 | -2.000000 |
| 2 | 51770.333333 | 14254.200796 | -0.707107 | -1.500000 |
| 3 | 87473.750000 | 24688.567399 | 0.154457 | -0.080320 |
| 4 | 108420.655172 | 29022.003886 | 0.961030 | 4.838197 |
| 5 | 133523.347607 | 27107.330927 | 0.332553 | 1.111189 |
| 6 | 161603.034759 | 36090.182933 | 0.357109 | 0.406551 |
| 7 | 207716.423197 | 44466.259414 | 0.782693 | 1.280099 |
| 8 | 274735.535714 | 63898.902253 | 0.740566 | 1.318193 |
| 9 | 367513.023256 | 81278.174849 | 1.314371 | 1.610063 |
| 10 | 438588.388889 | 159785.491058 | 0.380504 | -0.040923 |
cat_analysis("OverallQual","log_SalePrice")
mean_reduction("OverallQual","log_SalePrice")
------MEAN------- Total Mean = 12.0241 Categ Mean = 11.8700 Mean Reduction = 1.2809 % ------STD-------- Total Mean = 0.3995 Categ Mean = 0.2694 Mean Reduction = 32.5562 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| OverallQual | ||||
| 1 | 10.798804 | 0.310879 | 0.000000 | -2.000000 |
| 2 | 10.825383 | 0.306082 | -0.707107 | -1.500000 |
| 3 | 11.337474 | 0.306716 | -0.743512 | 0.689926 |
| 4 | 11.557148 | 0.280295 | -0.818844 | 2.715856 |
| 5 | 11.780657 | 0.211002 | -0.609478 | 1.534935 |
| 6 | 11.967308 | 0.229888 | -0.404589 | 0.440246 |
| 7 | 12.221773 | 0.210898 | -0.050121 | 0.889456 |
| 8 | 12.497191 | 0.231313 | -0.143142 | 0.637664 |
| 9 | 12.793268 | 0.203521 | 0.727296 | 0.556746 |
| 10 | 12.921305 | 0.403459 | -0.745042 | 0.518987 |
chi_test("OverallQual", "OverallCond")
Chi Sq. Value = 1322.0585284633246 P Vlaue of Hypo = 4.3223202616186964e-229
| OverallCond | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|
| OverallQual | |||||||||
| 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 2 | 0 | 1 | 0 | 0 | 0 | 0 |
| 3 | 0 | 2 | 3 | 6 | 2 | 5 | 1 | 1 | 0 |
| 4 | 0 | 1 | 5 | 20 | 44 | 24 | 19 | 2 | 1 |
| 5 | 0 | 1 | 7 | 21 | 142 | 104 | 89 | 29 | 4 |
| 6 | 0 | 0 | 6 | 6 | 180 | 83 | 66 | 30 | 3 |
| 7 | 0 | 0 | 1 | 3 | 247 | 28 | 23 | 8 | 9 |
| 8 | 0 | 0 | 0 | 1 | 148 | 7 | 7 | 2 | 3 |
| 9 | 0 | 1 | 0 | 0 | 42 | 0 | 0 | 0 | 0 |
| 10 | 0 | 0 | 0 | 0 | 15 | 1 | 0 | 0 | 2 |
nan_count("YearBuilt")
YearBuilt: Number of NaN values = 0
num_analysis("YearBuilt")
num_analysis("YearBuilt", "SalePrice")
nan_count("YearRemodAdd")
YearRemodAdd: Number of NaN values = 0
num_analysis("YearRemodAdd")
num_analysis("YearRemodAdd", "SalePrice")
sns.jointplot(x=train.loc[:, "YearRemodAdd"], y=train.loc[:, "Remodel_Age"])
<seaborn.axisgrid.JointGrid at 0x1ee87638d90>
nan_count("RoofStyle")
RoofStyle: Number of NaN values = 0
train.RoofStyle.value_counts()
Gable 1141 Hip 286 Flat 13 Gambrel 11 Mansard 7 Shed 2 Name: RoofStyle, dtype: int64
cat_analysis("RoofStyle")
RoofStyle_dict = {}
for obj, key in enumerate(mean_reduction("RoofStyle").sort_values("mean").index):
RoofStyle_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 189921.4015 Mean Reduction = -4.9747 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 69162.0719 Mean Reduction = 12.9407 %
train.RoofStyle = train.RoofStyle.map(RoofStyle_dict)
nan_count("RoofMatl")
RoofMatl: Number of NaN values = 0
train.RoofMatl.value_counts()
CompShg 1434 Tar&Grv 11 WdShngl 6 WdShake 5 Membran 1 Roll 1 ClyTile 1 Metal 1 Name: RoofMatl, dtype: int64
## Drop
cat_analysis("RoofMatl")
- Drop The RoofMatl Variable
- It Does Not Contain Much Information
mean_reduction("RoofMatl")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 214420.0054 Mean Reduction = -18.5157 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 96585.0824 Mean Reduction = -21.5786 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| RoofMatl | ||||
| ClyTile | 160000.000000 | NaN | 0.000000 | -3.000000 |
| CompShg | 179803.679219 | 77722.388636 | 1.749238 | 5.399991 |
| Membran | 241500.000000 | NaN | 0.000000 | -3.000000 |
| Metal | 180000.000000 | NaN | 0.000000 | -3.000000 |
| Roll | 137000.000000 | NaN | 0.000000 | -3.000000 |
| Tar&Grv | 185406.363636 | 65430.141720 | 0.091583 | -1.288276 |
| WdShake | 241400.000000 | 36218.779659 | -0.217368 | -0.919664 |
| WdShngl | 390250.000000 | 206969.019421 | 0.876708 | -0.321796 |
nan_count("Exterior1st")
Exterior1st: Number of NaN values = 0
train.Exterior1st.value_counts()
VinylSd 515 HdBoard 222 MetalSd 220 Wd Sdng 206 Plywood 108 CemntBd 61 BrkFace 50 WdShing 26 Stucco 25 AsbShng 20 Stone 2 BrkComm 2 AsphShn 1 CBlock 1 ImStucc 1 Name: Exterior1st, dtype: int64
cat_analysis("Exterior1st")
Exterior1st_dict = {}
for obj, key in enumerate(mean_reduction("Exterior1st").sort_values("mean").index):
Exterior1st_dict[key] = obj
train.Exterior1st = train.Exterior1st.map(Exterior1st_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 166387.3891 Mean Reduction = 8.0332 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 64267.2633 Mean Reduction = 19.1022 %
train.Exterior1st.value_counts().loc[train.Exterior1st.value_counts() <= 2].index
Int64Index([0, 13, 1, 2, 14], dtype='int64')
Exterior1st_dict = {}
for obj, key in enumerate(mean_reduction("Exterior1st").sort_values("mean").index):
Exterior1st_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 166387.3891 Mean Reduction = 8.0332 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 64267.2633 Mean Reduction = 19.1022 %
Exterior1st_dict[0] = 0
Exterior1st_dict[13] = 0
Exterior1st_dict[1] = 0
Exterior1st_dict[2] = 0
Exterior1st_dict[14] = 0
train.Exterior1st = train.Exterior1st.map(Exterior1st_dict)
cat_analysis("Exterior1st")
mean_reduction("Exterior1st")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 169106.1800 Mean Reduction = 6.5305 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 73666.7450 Mean Reduction = 7.2704 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| Exterior1st | ||||
| 0 | 160857.142857 | 94988.470478 | 0.305918 | -1.695223 |
| 3 | 107385.550000 | 33756.480660 | -0.379896 | -0.467595 |
| 4 | 149422.177273 | 54776.408990 | 1.835724 | 4.259495 |
| 5 | 149841.645631 | 71130.448152 | 3.719943 | 24.618274 |
| 6 | 150655.076923 | 72507.530741 | 2.127332 | 3.897955 |
| 7 | 162990.000000 | 83307.246529 | 1.357122 | 1.421784 |
| 8 | 163077.450450 | 66305.714164 | 4.955329 | 37.279846 |
| 9 | 175942.379630 | 49497.383293 | 0.993752 | 1.134094 |
| 10 | 194573.000000 | 82841.914370 | 0.790106 | 0.166320 |
| 11 | 213732.900971 | 80646.975785 | 1.292298 | 2.820233 |
| 12 | 231690.655738 | 120575.621318 | 0.573872 | -0.180628 |
train.Exterior2nd.value_counts()
VinylSd 504 MetalSd 214 HdBoard 207 Wd Sdng 197 Plywood 142 CmentBd 60 Wd Shng 38 Stucco 26 BrkFace 25 AsbShng 20 ImStucc 10 Brk Cmn 7 Stone 5 AsphShn 3 CBlock 1 Other 1 Name: Exterior2nd, dtype: int64
cat_analysis("Exterior2nd")
train.Exterior2nd.value_counts(normalize=True).round(4) * 100
VinylSd 34.52 MetalSd 14.66 HdBoard 14.18 Wd Sdng 13.49 Plywood 9.73 CmentBd 4.11 Wd Shng 2.60 Stucco 1.78 BrkFace 1.71 AsbShng 1.37 ImStucc 0.68 Brk Cmn 0.48 Stone 0.34 AsphShn 0.21 CBlock 0.07 Other 0.07 Name: Exterior2nd, dtype: float64
train.Exterior2nd = train.Exterior2nd.map(
{
"VinylSd": "VinylSd",
"MetalSd": "MetalSd",
"HdBoard": "HdBoard",
"Wd Sdng": "Wd Sdng",
"Plywood": "Plywood",
}
)
train.Exterior2nd.fillna("REST", inplace=True)
train.Exterior2nd.value_counts()
VinylSd 504 MetalSd 214 HdBoard 207 Wd Sdng 197 REST 196 Plywood 142 Name: Exterior2nd, dtype: int64
train.Exterior2nd.value_counts(normalize=True).round(4)* 100
VinylSd 34.52 MetalSd 14.66 HdBoard 14.18 Wd Sdng 13.49 REST 13.42 Plywood 9.73 Name: Exterior2nd, dtype: float64
cat_analysis("Exterior2nd")
Exterior2nd_dict = {}
for obj, key in enumerate(mean_reduction("Exterior2nd").sort_values("Exterior2nd").index):
Exterior2nd_dict[key] = obj
train.Exterior2nd = train.Exterior2nd.map(Exterior2nd_dict)
------MEAN------- Total Mean = 180921.1959 Categ Mean = 172184.2659 Mean Reduction = 4.8291 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 69882.8090 Mean Reduction = 12.0335 %
cat_analysis("Exterior2nd")
nan_count("MasVnrType")
MasVnrType: Number of NaN values = 8
cat_analysis("MasVnrType")
train.MasVnrType.value_counts(dropna=False)
None 864 BrkFace 445 Stone 128 BrkCmn 15 NaN 8 Name: MasVnrType, dtype: int64
train.MasVnrType.fillna("None", inplace=True)
train.MasVnrType.value_counts(dropna=False)
None 872 BrkFace 445 Stone 128 BrkCmn 15 Name: MasVnrType, dtype: int64
cat_analysis("MasVnrType")
MasVnrType_dict = {}
for obj, key in enumerate(mean_reduction("MasVnrType").sort_values("mean").index):
MasVnrType_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 193387.9517 Mean Reduction = -6.8907 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 72208.4307 Mean Reduction = 9.1060 %
MasVnrType_dict
{'BrkCmn': 0, 'None': 1, 'BrkFace': 2, 'Stone': 3}
train.MasVnrType = train.MasVnrType.map(MasVnrType_dict)
nan_count("MasVnrArea")
MasVnrArea: Number of NaN values = 8
train.MasVnrArea.describe()
count 1452.000000 mean 103.685262 std 181.066207 min 0.000000 25% 0.000000 50% 0.000000 75% 166.000000 max 1600.000000 Name: MasVnrArea, dtype: float64
train.MasVnrArea.fillna(0, inplace=True)
num_analysis("MasVnrArea")
train.loc[:,["MasVnrArea", "MasVnrType"]].loc[train.MasVnrArea == 0].MasVnrType.value_counts()
1 867 2 1 3 1 Name: MasVnrType, dtype: int64
train.loc[:,["MasVnrArea", "MasVnrType"]].loc[train.MasVnrArea != 0].MasVnrType.value_counts()
2 444 3 127 0 15 1 5 Name: MasVnrType, dtype: int64
nan_count("ExterQual")
ExterQual: Number of NaN values = 0
cat_analysis("ExterQual")
mean_reduction("ExterQual")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 207830.2499 Mean Reduction = -14.8734 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 67472.2181 Mean Reduction = 15.0679 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| ExterQual | ||||
| Ex | 367360.961538 | 116401.264200 | 0.728383 | 1.244714 |
| Fa | 87985.214286 | 39826.918794 | 1.559590 | 2.534499 |
| Gd | 231633.510246 | 71188.873899 | 1.793877 | 7.698168 |
| TA | 144341.313466 | 42471.815703 | 1.142810 | 3.590700 |
train.ExterQual = train.ExterQual.map(
{
"Fa": 0,
"TA": 1,
"Gd": 2,
"Ex": 3,
}
)
nan_count("ExterCond")
ExterCond: Number of NaN values = 0
cat_analysis("ExterCond")
nan_count("Foundation")
Foundation: Number of NaN values = 0
train.Foundation.value_counts()
PConc 647 CBlock 634 BrkTil 146 Slab 24 Stone 6 Wood 3 Name: Foundation, dtype: int64
cat_analysis("Foundation")
cat_analysis("Foundation", Violin=True)
Foundation_dict = {}
for obj, key in enumerate(mean_reduction("Foundation").sort_values("mean").index):
Foundation_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 161053.1150 Mean Reduction = 10.9816 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 59870.0314 Mean Reduction = 24.6373 %
train.Foundation = train.Foundation.map(Foundation_dict)
Bsmt_index = [ _ for _ in [ _ for _ in train.columns if "Bsm" in _] if "SF" in _ ]
Bsmt_index
['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
[ _ for _ in [ _ for _ in train.columns if "Bsm" in _] if "SF" not in _ ]
['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath']
train.BsmtQual.value_counts()
TA 649 Gd 618 Ex 121 Fa 35 Name: BsmtQual, dtype: int64
cat_analysis("BsmtQual")
train.BsmtQual = train.BsmtQual.map(
{
"Fa": 0,
"TA": 1,
"Gd": 2,
"Ex": 3,
}
)
train.BsmtQual.fillna(-1, inplace=True)
cat_analysis("BsmtQual")
train.BsmtCond.value_counts()
TA 1311 Gd 65 Fa 45 Po 2 Name: BsmtCond, dtype: int64
nan_count("Heating")
Heating: Number of NaN values = 0
train.Heating.value_counts(normalize=True).round(4)*100
GasA 97.81 GasW 1.23 Grav 0.48 Wall 0.27 OthW 0.14 Floor 0.07 Name: Heating, dtype: float64
cat_analysis("Heating")
train.BsmtExposure.value_counts()
No 953 Av 221 Gd 134 Mn 114 Name: BsmtExposure, dtype: int64
train.BsmtExposure = train.BsmtExposure.map(
{
"No": 0,
"Mn": 1,
"Av": 2,
"Gd": 3,
}
)
train.BsmtExposure.fillna(-1, inplace=True)
cat_analysis("BsmtExposure")
train.BsmtFinType1.value_counts(dropna=False)
Unf 430 GLQ 418 ALQ 220 BLQ 148 Rec 133 LwQ 74 NaN 37 Name: BsmtFinType1, dtype: int64
train.BsmtFinType1.value_counts(dropna=False, normalize=True).round(4) * 100
Unf 29.45 GLQ 28.63 ALQ 15.07 BLQ 10.14 Rec 9.11 LwQ 5.07 NaN 2.53 Name: BsmtFinType1, dtype: float64
cat_analysis("BsmtFinType1")
train.BsmtFinType1 = train.BsmtFinType1.map(
{
"Unf": 0,
"LwQ": 1,
"Rec": 2,
"BLQ": 3,
"ALQ": 4,
"GLQ": 5,
}
)
train.BsmtFinType1.fillna(-1, inplace=True)
cat_analysis("BsmtFinType1")
mean_reduction("BsmtFinType1")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 160220.8376 Mean Reduction = 11.4416 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 55601.0091 Mean Reduction = 30.0110 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| BsmtFinType1 | ||||
| -1.0 | 105652.891892 | 29278.694797 | 0.787447 | 1.884873 |
| 0.0 | 170670.576744 | 72577.080129 | 1.416616 | 3.810829 |
| 1.0 | 151852.702703 | 50796.987083 | 1.174375 | 1.089927 |
| 2.0 | 146889.248120 | 45190.118879 | 1.578908 | 6.064493 |
| 3.0 | 149493.655405 | 47327.292126 | 1.203884 | 2.704081 |
| 4.0 | 161573.068182 | 50821.277231 | 2.591656 | 13.840377 |
| 5.0 | 235413.720096 | 93215.613258 | 1.698872 | 5.090662 |
nan_count("BsmtFinType2")
BsmtFinType2: Number of NaN values = 38
train.BsmtFinType2.value_counts()
Unf 1256 Rec 54 LwQ 46 BLQ 33 ALQ 19 GLQ 14 Name: BsmtFinType2, dtype: int64
train.BsmtFinType2 = train.BsmtFinType2.map(
{
"Unf": 0,
"LwQ": 1,
"Rec": 2,
"BLQ": 3,
"ALQ": 4,
"GLQ": 5,
}
)
train.BsmtFinType2.fillna(-1, inplace=True)
cat_analysis("BsmtFinType2")
nan_count("HeatingQC")
HeatingQC: Number of NaN values = 0
train.HeatingQC.value_counts()
Ex 741 TA 428 Gd 241 Fa 49 Po 1 Name: HeatingQC, dtype: int64
cat_analysis("HeatingQC")
train.HeatingQC.unique()
HeatingQC_dict = {
"Ex": 0,
"Gd": 1,
"TA": 2,
"Fa": 3,
"Po": 3,
}
train.HeatingQC.map(HeatingQC_dict).value_counts(normalize=True).round(4) * 100
0 50.75 2 29.32 1 16.51 3 3.42 Name: HeatingQC, dtype: float64
train.HeatingQC = train.HeatingQC.map(HeatingQC_dict)
nan_count("CentralAir")
CentralAir: Number of NaN values = 0
train.CentralAir.value_counts(normalize=True) .round(4)* 100
Y 93.49 N 6.51 Name: CentralAir, dtype: float64
cat_analysis("CentralAir")
cat_analysis("CentralAir", Violin=True)
mean_reduction("CentralAir")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 145725.3918 Mean Reduction = 19.4537 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 59738.2404 Mean Reduction = 24.8032 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| CentralAir | ||||
| N | 105264.073684 | 40671.273961 | 1.169027 | 2.717054 |
| Y | 186186.709890 | 78805.206820 | 1.963899 | 6.807669 |
CentralAir_dict = {
"N": 0,
"Y": 1,
}
train.CentralAir = train.CentralAir.map(CentralAir_dict)
nan_count("Electrical")
Electrical: Number of NaN values = 1
train.Electrical.value_counts()
SBrkr 1334 FuseA 94 FuseF 27 FuseP 3 Mix 1 Name: Electrical, dtype: int64
train.Electrical.value_counts(normalize=True).round(4) * 100
SBrkr 91.43 FuseA 6.44 FuseF 1.85 FuseP 0.21 Mix 0.07 Name: Electrical, dtype: float64
cat_analysis("Electrical")
train.Electrical.value_counts()
SBrkr 1334 FuseA 94 FuseF 27 FuseP 3 Mix 1 Name: Electrical, dtype: int64
train.Electrical = train.Electrical.apply(lambda x: 1 if (x=="SBrkr") else 0)
cat_analysis("Electrical")
nan_count("BedroomAbvGr")
BedroomAbvGr: Number of NaN values = 0
train.BedroomAbvGr.value_counts()
3 804 2 358 4 213 1 50 5 21 6 7 0 6 8 1 Name: BedroomAbvGr, dtype: int64
cat_analysis("BedroomAbvGr")
train.BedroomAbvGr.value_counts(normalize=True).round(4) * 100
3 55.07 2 24.52 4 14.59 1 3.42 5 1.44 6 0.48 0 0.41 8 0.07 Name: BedroomAbvGr, dtype: float64
mean_reduction("BedroomAbvGr").sort_values("mean")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 184866.1772 Mean Reduction = -2.1805 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 79440.0528 Mean Reduction = 0.0031 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| BedroomAbvGr | ||||
| 6 | 143779.000000 | 26455.384839 | 1.510435 | 1.166394 |
| 2 | 158197.659218 | 78197.106241 | 2.093612 | 6.298546 |
| 1 | 173162.420000 | 102975.067584 | 1.184939 | 1.188272 |
| 5 | 180819.047619 | 68472.480743 | 0.502166 | -0.660556 |
| 3 | 181056.870647 | 63793.924903 | 1.217931 | 1.954024 |
| 8 | 200000.000000 | NaN | 0.000000 | -3.000000 |
| 4 | 220421.253521 | 109510.516383 | 1.941543 | 5.396161 |
| 0 | 221493.166667 | 106675.888935 | 0.425696 | -1.211711 |
train.BedroomAbvGr = train.BedroomAbvGr.map(
{
0: 1,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 5,
7: 5,
8: 5,
}
)
cat_analysis("BedroomAbvGr")
mean_reduction("BedroomAbvGr")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 182111.2513 Mean Reduction = -0.6578 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 83300.1744 Mean Reduction = -4.8559 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| BedroomAbvGr | ||||
| 1 | 178340.714286 | 103484.880803 | 1.074242 | 0.781505 |
| 2 | 158197.659218 | 78197.106241 | 2.093612 | 6.298546 |
| 3 | 181056.870647 | 63793.924903 | 1.217931 | 1.954024 |
| 4 | 220421.253521 | 109510.516383 | 1.941543 | 5.396161 |
| 5 | 172539.758621 | 61514.443533 | 0.801759 | -0.051893 |
nan_count("KitchenAbvGr")
KitchenAbvGr: Number of NaN values = 0
train.KitchenAbvGr.value_counts()
1 1392 2 65 3 2 0 1 Name: KitchenAbvGr, dtype: int64
train.KitchenAbvGr.value_counts(normalize=True).round(4) * 100
1 95.34 2 4.45 3 0.14 0 0.07 Name: KitchenAbvGr, dtype: float64
cat_analysis("KitchenAbvGr")
nan_count("KitchenQual")
KitchenQual: Number of NaN values = 0
cat_analysis("KitchenQual")
train.KitchenQual.value_counts()
TA 735 Gd 586 Ex 100 Fa 39 Name: KitchenQual, dtype: int64
train.KitchenQual = train.KitchenQual.map(
{
"Fa": 0,
"TA": 1,
"Gd": 2,
"Ex": 3
}
)
cat_analysis("KitchenQual")
mean_reduction("KitchenQual")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 196549.6026 Mean Reduction = -8.6382 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 64945.9134 Mean Reduction = 18.2479 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| KitchenQual | ||||
| 0 | 105565.205128 | 36004.254037 | 0.232863 | -0.255302 |
| 1 | 139962.511565 | 38896.280336 | 0.996612 | 3.652486 |
| 2 | 212116.023891 | 64020.176702 | 1.185759 | 3.366636 |
| 3 | 328554.670000 | 120862.942573 | 0.922703 | 1.820550 |
nan_count("TotRmsAbvGrd")
TotRmsAbvGrd: Number of NaN values = 0
cat_analysis("TotRmsAbvGrd")
train.TotRmsAbvGrd.value_counts().sort_index()
2 1 3 17 4 97 5 275 6 402 7 329 8 187 9 75 10 47 11 18 12 11 14 1 Name: TotRmsAbvGrd, dtype: int64
train.TotRmsAbvGrd = train.TotRmsAbvGrd.map(
{
1 : 3,
2 : 3,
3 : 3,
4 : 4,
5 : 5,
6 : 6,
7 : 7,
8 : 8,
9 : 9,
10:10,
11:11,
12:12,
13:12,
14:12,
}
)
cat_analysis("TotRmsAbvGrd")
mean_reduction("TotRmsAbvGrd")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 208452.8387 Mean Reduction = -15.2175 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 86064.1715 Mean Reduction = -8.3352 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| TotRmsAbvGrd | ||||
| 3 | 107222.222222 | 36318.161970 | -0.117054 | -1.310909 |
| 4 | 122844.628866 | 41661.805370 | 0.760222 | 1.264673 |
| 5 | 141550.749091 | 49656.905700 | 1.890533 | 5.508219 |
| 6 | 161303.296020 | 48707.372941 | 0.959624 | 2.163478 |
| 7 | 196666.784195 | 64404.686026 | 1.019251 | 1.600607 |
| 8 | 213427.529412 | 73985.715639 | 0.858471 | 0.499570 |
| 9 | 252988.173333 | 83131.922273 | 0.331793 | 0.089782 |
| 10 | 296279.170213 | 155027.992122 | 1.144008 | 1.349232 |
| 11 | 318022.000000 | 148216.975202 | 0.294642 | -0.686301 |
| 12 | 274223.833333 | 159530.177676 | 1.124890 | -0.489046 |
nan_count("Fireplaces")
Fireplaces: Number of NaN values = 0
train.Fireplaces.value_counts()
0 690 1 650 2 115 3 5 Name: Fireplaces, dtype: int64
train.Fireplaces.value_counts(normalize=True).round(4) * 100
0 47.26 1 44.52 2 7.88 3 0.34 Name: Fireplaces, dtype: float64
cat_analysis("Fireplaces")
train.Fireplaces = train.Fireplaces.apply(lambda x: 2 if (x > 2) else x)
train.Fireplaces.value_counts()
0 690 1 650 2 120 Name: Fireplaces, dtype: int64
train.Fireplaces.value_counts(normalize=True).round(4) * 100
0 47.26 1 44.52 2 8.22 Name: Fireplaces, dtype: float64
cat_analysis("Fireplaces")
mean_reduction("Fireplaces")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 198079.8028 Mean Reduction = -9.4840 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 80056.0239 Mean Reduction = -0.7723 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| Fireplaces | ||||
| 0 | 141331.482609 | 44389.786884 | 0.763780 | 1.245669 |
| 1 | 211843.909231 | 79531.825318 | 1.339179 | 2.674325 |
| 2 | 241064.016667 | 116246.459371 | 2.043046 | 5.374948 |
nan_count("FireplaceQu")
FireplaceQu: Number of NaN values = 690
train.FireplaceQu.value_counts(dropna=False)
NaN 690 Gd 380 TA 313 Fa 33 Ex 24 Po 20 Name: FireplaceQu, dtype: int64
train.FireplaceQu.value_counts(normalize=True, dropna=False).round(5) * 100
NaN 47.260 Gd 26.027 TA 21.438 Fa 2.260 Ex 1.644 Po 1.370 Name: FireplaceQu, dtype: float64
cat_analysis("FireplaceQu")
train.FireplaceQu = train.FireplaceQu.fillna("NA")
cat_analysis("FireplaceQu")
mean_reduction("FireplaceQu").sort_values("mean")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 201363.5870 Mean Reduction = -11.2991 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 65976.6659 Mean Reduction = 16.9504 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| FireplaceQu | ||||
| Po | 129764.150000 | 31080.842115 | -0.831140 | 0.278500 |
| NA | 141331.482609 | 44389.786884 | 0.763780 | 1.245669 |
| Fa | 167298.484848 | 34287.717564 | 0.894086 | 0.373925 |
| TA | 205723.488818 | 71367.028772 | 2.585307 | 13.584703 |
| Gd | 226351.415789 | 91123.148369 | 1.197213 | 1.767435 |
| Ex | 337712.500000 | 123611.471764 | 1.386957 | 3.554678 |
train.FireplaceQu = train.FireplaceQu.map(
{
"Po": 0,
"NA": 1,
"Fa": 2,
"TA": 3,
"Gd": 4,
"Ex": 5,
}
)
cat_analysis("FireplaceQu")
nan_count("GarageType")
GarageType: Number of NaN values = 81
train.GarageType.fillna("NA", inplace=True)
cat_analysis("GarageType")
mean_reduction("GarageType").sort_values("mean")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 159552.7101 Mean Reduction = 11.8109 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 53872.3693 Mean Reduction = 32.1870 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| GarageType | ||||
| NA | 103317.283951 | 32815.023389 | 0.482578 | 0.701521 |
| CarPort | 109962.111111 | 24638.183681 | 1.232117 | 0.841136 |
| Detchd | 134091.162791 | 41391.549953 | 2.400113 | 13.712168 |
| 2Types | 151283.333333 | 34917.068415 | -1.076278 | 0.048402 |
| Basment | 160570.684211 | 63967.283574 | 1.441136 | 3.171987 |
| Attchd | 202892.656322 | 77146.621778 | 2.032313 | 8.009147 |
| BuiltIn | 254751.738636 | 102230.854394 | 1.133224 | 1.163546 |
GarageType_dict = {}
for obj, key in enumerate(mean_reduction("GarageType").sort_values("mean").index):
GarageType_dict[key] = obj
print("GarageType_dict = ", GarageType_dict)
------MEAN-------
Total Mean = 180921.1959
Categ Mean = 159552.7101
Mean Reduction = 11.8109 %
------STD--------
Total Mean = 79442.5029
Categ Mean = 53872.3693
Mean Reduction = 32.1870 %
GarageType_dict = {'NA': 0, 'CarPort': 1, 'Detchd': 2, '2Types': 3, 'Basment': 4, 'Attchd': 5, 'BuiltIn': 6}
train.GarageType = train.GarageType.map(GarageType_dict)
cat_analysis("GarageType")
nan_count("GarageYrBlt")
GarageYrBlt: Number of NaN values = 81
num_analysis("GarageYrBlt", "SalePrice")
train["Age_Garage"] = train.YrSold - train.GarageYrBlt
num_analysis("Age_Garage", "SalePrice")
nan_count("GarageFinish")
GarageFinish: Number of NaN values = 81
train.GarageFinish.value_counts(dropna=False)
Unf 605 RFn 422 Fin 352 NaN 81 Name: GarageFinish, dtype: int64
train.GarageFinish.fillna("NA", inplace=True)
cat_analysis("GarageFinish")
mean_reduction("GarageFinish").sort_values("mean")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 171898.8168 Mean Reduction = 4.9869 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 59952.5918 Mean Reduction = 24.5334 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| GarageFinish | ||||
| NA | 103317.283951 | 32815.023389 | 0.482578 | 0.701521 |
| Unf | 142156.423140 | 46498.528211 | 1.964609 | 7.467639 |
| RFn | 202068.869668 | 63536.222518 | 1.547033 | 4.583898 |
| Fin | 240052.690341 | 96960.593232 | 1.762373 | 4.911414 |
train.GarageFinish = train.GarageFinish.map(
{
"NA" : 0,
"Unf": 1,
"RFn": 2,
"Fin": 3,
}
)
cat_analysis("GarageFinish")
nan_count("GarageCars")
GarageCars: Number of NaN values = 0
train.GarageCars.value_counts()
2 824 1 369 3 181 0 81 4 5 Name: GarageCars, dtype: int64
cat_analysis("GarageCars")
train.GarageCars = train.GarageCars.apply(lambda x: 3 if (x >= 3) else x)
cat_analysis("GarageCars")
mean_reduction("GarageCars")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 180444.2813 Mean Reduction = 0.2636 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 55549.2937 Mean Reduction = 30.0761 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| GarageCars | ||||
| 0 | 103317.283951 | 32815.023389 | 0.482578 | 0.701521 |
| 1 | 128116.688347 | 30412.386890 | 0.801926 | 3.337260 |
| 2 | 183851.663835 | 51617.144258 | 0.863292 | 1.425320 |
| 3 | 306491.489247 | 107352.620315 | 1.067008 | 2.733433 |
nan_count("GarageArea")
GarageArea: Number of NaN values = 0
train.GarageArea.describe()
count 1460.000000 mean 472.980137 std 213.804841 min 0.000000 25% 334.500000 50% 480.000000 75% 576.000000 max 1418.000000 Name: GarageArea, dtype: float64
num_analysis("GarageArea")
num_analysis("GarageArea", "SalePrice")
nan_count("GarageQual")
GarageQual: Number of NaN values = 81
train.GarageQual.fillna("NA", inplace=True)
train.GarageQual.value_counts()
TA 1311 NA 81 Fa 48 Gd 14 Po 3 Ex 3 Name: GarageQual, dtype: int64
cat_analysis("GarageQual")
train.GarageQual = train.GarageQual.map(
{
"Po": 0,
"NA": 0,
"Fa": 1,
"TA": 2,
"Gd": 3,
"Ex": 3,
}
)
cat_analysis("GarageQual")
mean_reduction("GarageQual")
------MEAN------- Total Mean = 180921.1959 Categ Mean = 158641.2527 Mean Reduction = 12.3147 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 63225.0390 Mean Reduction = 20.4141 %
| mean | std | skew | kurtosis | |
|---|---|---|---|---|
| GarageQual | ||||
| 0 | 103204.761905 | 32680.443121 | 0.477718 | 0.664171 |
| 1 | 123573.354167 | 42971.440985 | 1.553764 | 2.396546 |
| 2 | 187489.836003 | 78774.948567 | 1.966493 | 6.927358 |
| 3 | 220297.058824 | 98473.323213 | 0.921633 | 0.583568 |
nan_count("GarageCond")
GarageCond: Number of NaN values = 81
train.GarageCond.fillna("NA", inplace=True)
train.GarageCond.value_counts()
TA 1326 NA 81 Fa 35 Gd 9 Po 7 Ex 2 Name: GarageCond, dtype: int64
cat_analysis("GarageCond")
train.GarageCond = train.GarageCond.map(
{
"Po": 0,
"NA": 0,
"Fa": 1,
"TA": 2,
"Gd": 3,
"Ex": 3
}
)
cat_analysis("GarageCond")
train.GarageCond.value_counts()
2 1326 0 88 1 35 3 11 Name: GarageCond, dtype: int64
nan_count("PavedDrive")
PavedDrive: Number of NaN values = 0
train.PavedDrive.value_counts()
Y 1340 N 90 P 30 Name: PavedDrive, dtype: int64
train.PavedDrive.value_counts(normalize=True).round(4) * 100
Y 91.78 N 6.16 P 2.05 Name: PavedDrive, dtype: float64
cat_analysis("PavedDrive")
PavedDrive_dict = {}
for obj, key in enumerate(mean_reduction("PavedDrive").sort_values("mean").index):
PavedDrive_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 144601.0320 Mean Reduction = 20.0751 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 52507.0189 Mean Reduction = 33.9056 %
train.PavedDrive = train.PavedDrive.map(PavedDrive_dict)
nan_count("PoolArea")
PoolArea: Number of NaN values = 0
train.PoolArea.describe()
count 1460.000000 mean 2.758904 std 40.177307 min 0.000000 25% 0.000000 50% 0.000000 75% 0.000000 max 738.000000 Name: PoolArea, dtype: float64
num_analysis("PoolArea")
(train.PoolArea != 0).sum()
7
train.PoolQC.value_counts(dropna=False)
NaN 1453 Gd 3 Ex 2 Fa 2 Name: PoolQC, dtype: int64
train.Fence.value_counts(dropna=False)
NaN 1179 MnPrv 157 GdPrv 59 GdWo 54 MnWw 11 Name: Fence, dtype: int64
sns.heatmap(train.loc[:,["Fence"]].isna())
<AxesSubplot:>
train.Fence.fillna("NA", inplace=True)
cat_analysis("Fence")
train.MiscFeature.value_counts(dropna=False)
NaN 1406 Shed 49 Gar2 2 Othr 2 TenC 1 Name: MiscFeature, dtype: int64
train.MiscFeature.value_counts(dropna=False, normalize=True).round(4) * 100
NaN 96.30 Shed 3.36 Gar2 0.14 Othr 0.14 TenC 0.07 Name: MiscFeature, dtype: float64
train.MiscVal.value_counts(dropna=False)
0 1408 400 11 500 8 700 5 450 4 600 4 2000 4 480 2 1200 2 2500 1 350 1 560 1 54 1 620 1 8300 1 800 1 1150 1 15500 1 1300 1 1400 1 3500 1 Name: MiscVal, dtype: int64
train.MiscVal.value_counts(dropna=False, normalize=True).round(4) * 100
0 96.44 400 0.75 500 0.55 700 0.34 450 0.27 600 0.27 2000 0.27 480 0.14 1200 0.14 2500 0.07 350 0.07 560 0.07 54 0.07 620 0.07 8300 0.07 800 0.07 1150 0.07 15500 0.07 1300 0.07 1400 0.07 3500 0.07 Name: MiscVal, dtype: float64
train.SaleType.value_counts(dropna=False)
WD 1267 New 122 COD 43 ConLD 9 ConLI 5 ConLw 5 CWD 4 Oth 3 Con 2 Name: SaleType, dtype: int64
train.SaleType.value_counts(normalize=True, dropna=False).round(4) * 100
WD 86.78 New 8.36 COD 2.95 ConLD 0.62 ConLI 0.34 ConLw 0.34 CWD 0.27 Oth 0.21 Con 0.14 Name: SaleType, dtype: float64
train.SaleType = train.SaleType.apply( lambda x: 0 if (x == "WD") else 1 )
cat_analysis("SaleType")
train.SaleCondition.value_counts(dropna=False)
Normal 1198 Partial 125 Abnorml 101 Family 20 Alloca 12 AdjLand 4 Name: SaleCondition, dtype: int64
cat_analysis("SaleCondition")
SaleCondition_dict = {}
for obj, key in enumerate(mean_reduction("SaleCondition").sort_values("mean").index):
SaleCondition_dict[key] = obj
------MEAN------- Total Mean = 180921.1959 Categ Mean = 169187.1687 Mean Reduction = 6.4857 % ------STD-------- Total Mean = 79442.5029 Categ Mean = 69103.7080 Mean Reduction = 13.0142 %
train.SaleCondition = train.SaleCondition.map(SaleCondition_dict)
nan_count("GrLivArea")
GrLivArea: Number of NaN values = 0
num_analysis("GrLivArea")
num_analysis("GrLivArea", "SalePrice")
train.MoSold.value_counts(dropna=False).sort_index()
1 58 2 52 3 106 4 141 5 204 6 253 7 234 8 122 9 63 10 89 11 79 12 59 Name: MoSold, dtype: int64
cat_analysis("MoSold")
train.Functional.value_counts(dropna=True)
Typ 1360 Min2 34 Min1 31 Mod 15 Maj1 14 Maj2 5 Sev 1 Name: Functional, dtype: int64
train.Functional = train.Functional.map(
{
"Sal" : 0,
"Sev" : 1,
"Maj2": 2,
"Maj1": 3,
"Mod" : 4,
"Min2": 5,
"Min1": 6,
"Typ" : 7,
}
)
train["Floors_no"] = (train.loc[:,["1stFlrSF", "2ndFlrSF", "LowQualFinSF"]] != 0).sum(axis=1)
[ _ for _ in train.columns if "SF" in _ ]
['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'WoodDeckSF', 'OpenPorchSF', 'Total_SF']
train.drop(
labels =
[
"Utilities","Condition2","YearBuilt","RoofMatl","ExterCond","Heating","KitchenAbvGr","PoolArea","Fence",
"MiscFeature","Street","Alley","PoolQC","LandSlope", "BsmtCond", "Id", "BsmtFinSF2","BsmtUnfSF",'1stFlrSF',
'2ndFlrSF','WoodDeckSF','OpenPorchSF','EnclosedPorch',"Half_bath","Full_bath",'BsmtFullBath','BsmtHalfBath',
'FullBath','HalfBath',
],
axis = 1,
inplace = True
)
train.select_dtypes("object")
| 0 |
|---|
| 1 |
| 2 |
| 3 |
| 4 |
| ... |
| 1455 |
| 1456 |
| 1457 |
| 1458 |
| 1459 |
1460 rows × 0 columns
train.corr().loc[:,"SalePrice"].sort_values(ascending=False) * 100
SalePrice 100.000000 log_SalePrice 94.837373 Total_SF 81.008777 OverallQual 79.098160 GrLivArea 70.862448 Neighborhood 69.688224 ExterQual 68.263924 KitchenQual 65.959972 GarageCars 64.665180 GarageArea 62.343144 BsmtQual 62.292473 TotalBsmtSF 61.358055 All_bath 61.300461 GarageFinish 54.924676 TotRmsAbvGrd 53.513181 FireplaceQu 52.593236 YearRemodAdd 50.710097 Foundation 50.550317 GarageType 48.912999 GarageYrBlt 48.636168 MSSubClass 47.698243 MasVnrArea 47.261450 Fireplaces 46.954320 MasVnrType 42.369742 BsmtFinSF1 38.641981 BsmtExposure 37.469622 Exterior1st 36.022398 LotFrontage 35.179910 MSZoning 32.460953 BsmtFinType1 30.490787 GarageQual 28.084837 HouseStyle 27.307537 GarageCond 27.016925 LotShape 26.569896 LotArea 26.384335 CentralAir 25.132816 SaleType 24.259752 Electrical 24.189605 RoofStyle 23.977726 SaleCondition 23.811114 PavedDrive 23.135695 BedroomAbvGr 18.125061 BldgType 17.602476 Condition1 17.239779 LandContour 15.843700 Road 14.852829 Exterior2nd 14.133754 LotConfig 12.288236 Floors_no 11.916439 ScreenPorch 11.144657 Functional 10.761889 MoSold 4.643225 3SsnPorch 4.458367 BsmtFinType2 -0.532316 MiscVal -2.118958 Remodel_Done -2.193260 LowQualFinSF -2.560613 YrSold -2.892259 OverallCond -7.785589 HeatingQC -42.764730 Age_Garage -48.706859 Remodel_Age -50.907874 Age_At_Sale -52.335042 Name: SalePrice, dtype: float64
%%time
fig, ax = plt.subplots(1,1, figsize=(18,18))
sns.heatmap(train.corr(), square=True, annot=False)
plt.show()
Wall time: 7.54 s
train.corr().loc[:,"SalePrice"].abs().sort_values(ascending=False).nlargest(11)
SalePrice 1.000000 log_SalePrice 0.948374 Total_SF 0.810088 OverallQual 0.790982 GrLivArea 0.708624 Neighborhood 0.696882 ExterQual 0.682639 KitchenQual 0.659600 GarageCars 0.646652 GarageArea 0.623431 BsmtQual 0.622925 Name: SalePrice, dtype: float64
%%time
fig, ax = plt.subplots(1,1, figsize=(15,15))
sns.heatmap(train.loc[:,train.corr().loc[:,"SalePrice"].abs().sort_values(ascending=False).nlargest(11).index].corr(), square=True, annot=True)
plt.show()
Wall time: 2.54 s
train.loc[:,train.corr().loc[:,"SalePrice"].abs().sort_values(ascending=False).nlargest(11).index]
| SalePrice | log_SalePrice | Total_SF | OverallQual | GrLivArea | Neighborhood | ExterQual | KitchenQual | GarageCars | GarageArea | BsmtQual | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 208500 | 12.247694 | 3175 | 7 | 1710 | 16 | 2 | 2 | 2 | 548 | 2.0 |
| 1 | 181500 | 12.109011 | 3282 | 6 | 1262 | 20 | 1 | 1 | 2 | 460 | 2.0 |
| 2 | 223500 | 12.317167 | 3356 | 7 | 1786 | 16 | 2 | 2 | 2 | 608 | 2.0 |
| 3 | 140000 | 11.849398 | 3422 | 7 | 1717 | 17 | 1 | 2 | 3 | 642 | 1.0 |
| 4 | 250000 | 12.429216 | 4455 | 8 | 2198 | 24 | 2 | 2 | 3 | 836 | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 175000 | 12.072541 | 3100 | 6 | 1647 | 14 | 1 | 1 | 2 | 460 | 2.0 |
| 1456 | 210000 | 12.254863 | 4464 | 6 | 2073 | 13 | 1 | 1 | 2 | 500 | 2.0 |
| 1457 | 266500 | 12.493130 | 3804 | 7 | 2340 | 17 | 3 | 2 | 1 | 252 | 1.0 |
| 1458 | 142125 | 11.864462 | 2874 | 5 | 1078 | 10 | 1 | 2 | 1 | 240 | 1.0 |
| 1459 | 147500 | 11.901583 | 3592 | 5 | 1256 | 4 | 2 | 1 | 1 | 276 | 1.0 |
1460 rows × 11 columns
high_corr_field = list(train.corr().loc[:,"SalePrice"].abs().sort_values(ascending=False).nlargest(12).index)[2:]; high_corr_field
['Total_SF', 'OverallQual', 'GrLivArea', 'Neighborhood', 'ExterQual', 'KitchenQual', 'GarageCars', 'GarageArea', 'BsmtQual', 'TotalBsmtSF']
train.isna().sum()[ train.isna().sum() > 0 ]
LotFrontage 259 GarageYrBlt 81 Age_Garage 81 dtype: int64
%%time
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler, StandardScaler, QuantileTransformer
from sklearn.pipeline import Pipeline
QT = QuantileTransformer(output_distribution='normal')
KI = KNNImputer()
y_train = train.SalePrice
x_train = train.drop(labels=["SalePrice", "log_SalePrice"], axis=1)
train_QT = pd.DataFrame(QT.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
x_train_impute = KI.fit_transform(train_QT)
x_train_filled = pd.DataFrame(QT.inverse_transform(x_train_impute), columns=x_train.columns, index=x_train.index)
Wall time: 473 ms
x_train_filled
| MSSubClass | MSZoning | LotFrontage | LotArea | LotShape | LandContour | LotConfig | Neighborhood | Condition1 | BldgType | HouseStyle | OverallQual | OverallCond | YearRemodAdd | RoofStyle | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | Foundation | BsmtQual | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | TotalBsmtSF | HeatingQC | CentralAir | Electrical | LowQualFinSF | GrLivArea | BedroomAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | 3SsnPorch | ScreenPorch | MiscVal | MoSold | YrSold | SaleType | SaleCondition | Total_SF | Age_At_Sale | Remodel_Age | Remodel_Done | All_bath | Road | Age_Garage | Floors_no | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.0 | 3.0 | 65.0 | 8450.0 | 0.0 | 1.0 | 0.0 | 16.0 | 4.0 | 4.0 | 6.0 | 7.0 | 5.0 | 2003.0 | 1.0 | 11.0 | 4.0 | 2.0 | 196.0 | 2.0 | 5.0 | 2.0 | 0.0 | 5.0 | 706.0 | 0.0 | 856.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1710.0 | 3.0 | 2.0 | 8.0 | 7.0 | 0.0 | 1.0 | 5.0 | 2003.0 | 2.0 | 2.0 | 548.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2008.0 | 0.0 | 4.0 | 3175.0 | 5.0 | 5.0 | 0.0 | 4.0 | 3.0 | 5.0 | 2.0 |
| 1 | 11.0 | 3.0 | 80.0 | 9600.0 | 0.0 | 1.0 | 1.0 | 20.0 | 2.0 | 4.0 | 5.0 | 6.0 | 8.0 | 1976.0 | 1.0 | 4.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2.0 | 2.0 | 3.0 | 4.0 | 978.0 | 0.0 | 1262.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1262.0 | 3.0 | 1.0 | 6.0 | 7.0 | 1.0 | 3.0 | 5.0 | 1976.0 | 2.0 | 2.0 | 460.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 5.0 | 2007.0 | 0.0 | 4.0 | 3282.0 | 31.0 | 31.0 | 0.0 | 3.0 | 3.0 | 31.0 | 1.0 |
| 2 | 14.0 | 3.0 | 68.0 | 11250.0 | 1.0 | 1.0 | 0.0 | 16.0 | 4.0 | 4.0 | 6.0 | 7.0 | 5.0 | 2002.0 | 1.0 | 11.0 | 4.0 | 2.0 | 162.0 | 2.0 | 5.0 | 2.0 | 1.0 | 5.0 | 486.0 | 0.0 | 920.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1786.0 | 3.0 | 2.0 | 6.0 | 7.0 | 1.0 | 3.0 | 5.0 | 2001.0 | 2.0 | 2.0 | 608.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 9.0 | 2008.0 | 0.0 | 4.0 | 3356.0 | 7.0 | 6.0 | 1.0 | 4.0 | 3.0 | 7.0 | 2.0 |
| 3 | 9.0 | 3.0 | 60.0 | 9550.0 | 1.0 | 1.0 | 2.0 | 17.0 | 4.0 | 4.0 | 6.0 | 7.0 | 5.0 | 1970.0 | 1.0 | 5.0 | 3.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 4.0 | 216.0 | 0.0 | 756.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1717.0 | 3.0 | 2.0 | 7.0 | 7.0 | 1.0 | 4.0 | 2.0 | 1998.0 | 1.0 | 3.0 | 642.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2006.0 | 0.0 | 1.0 | 3422.0 | 91.0 | 36.0 | 1.0 | 2.0 | 3.0 | 8.0 | 2.0 |
| 4 | 14.0 | 3.0 | 84.0 | 14260.0 | 1.0 | 1.0 | 1.0 | 24.0 | 4.0 | 4.0 | 6.0 | 8.0 | 5.0 | 2000.0 | 1.0 | 11.0 | 4.0 | 2.0 | 350.0 | 2.0 | 5.0 | 2.0 | 2.0 | 5.0 | 655.0 | 0.0 | 1145.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2198.0 | 4.0 | 2.0 | 9.0 | 7.0 | 1.0 | 3.0 | 5.0 | 2000.0 | 2.0 | 3.0 | 836.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 12.0 | 2008.0 | 0.0 | 4.0 | 4455.0 | 8.0 | 8.0 | 0.0 | 4.0 | 3.0 | 8.0 | 2.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | 14.0 | 3.0 | 62.0 | 7917.0 | 0.0 | 1.0 | 0.0 | 14.0 | 4.0 | 4.0 | 6.0 | 6.0 | 5.0 | 2000.0 | 1.0 | 11.0 | 4.0 | 1.0 | 0.0 | 1.0 | 5.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 953.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1647.0 | 3.0 | 1.0 | 7.0 | 7.0 | 1.0 | 3.0 | 5.0 | 1999.0 | 2.0 | 2.0 | 460.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 8.0 | 2007.0 | 0.0 | 4.0 | 3100.0 | 8.0 | 7.0 | 1.0 | 3.0 | 3.0 | 8.0 | 2.0 |
| 1456 | 11.0 | 3.0 | 85.0 | 13175.0 | 0.0 | 1.0 | 0.0 | 13.0 | 4.0 | 4.0 | 5.0 | 6.0 | 6.0 | 1988.0 | 1.0 | 9.0 | 2.0 | 3.0 | 119.0 | 1.0 | 2.0 | 2.0 | 0.0 | 4.0 | 790.0 | 2.0 | 1542.0 | 2.0 | 1.0 | 1.0 | 0.0 | 2073.0 | 3.0 | 1.0 | 7.0 | 6.0 | 2.0 | 3.0 | 5.0 | 1978.0 | 1.0 | 2.0 | 500.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2010.0 | 0.0 | 4.0 | 4464.0 | 32.0 | 22.0 | 1.0 | 3.0 | 3.0 | 32.0 | 1.0 |
| 1457 | 9.0 | 3.0 | 66.0 | 9042.0 | 0.0 | 1.0 | 0.0 | 17.0 | 4.0 | 4.0 | 6.0 | 7.0 | 9.0 | 2006.0 | 1.0 | 12.0 | 3.0 | 1.0 | 0.0 | 3.0 | 3.0 | 1.0 | 0.0 | 5.0 | 275.0 | 0.0 | 1152.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2340.0 | 4.0 | 2.0 | 9.0 | 7.0 | 2.0 | 4.0 | 5.0 | 1941.0 | 2.0 | 1.0 | 252.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 2500.0 | 5.0 | 2010.0 | 0.0 | 4.0 | 3804.0 | 69.0 | 4.0 | 1.0 | 2.0 | 3.0 | 69.0 | 2.0 |
| 1458 | 11.0 | 3.0 | 68.0 | 9717.0 | 0.0 | 1.0 | 0.0 | 10.0 | 4.0 | 4.0 | 5.0 | 5.0 | 6.0 | 1996.0 | 4.0 | 4.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2.0 | 1.0 | 1.0 | 5.0 | 49.0 | 2.0 | 1078.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1078.0 | 2.0 | 2.0 | 5.0 | 7.0 | 0.0 | 1.0 | 5.0 | 1950.0 | 1.0 | 1.0 | 240.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 4.0 | 2010.0 | 0.0 | 4.0 | 2874.0 | 60.0 | 14.0 | 1.0 | 2.0 | 3.0 | 60.0 | 1.0 |
| 1459 | 11.0 | 3.0 | 75.0 | 9937.0 | 0.0 | 1.0 | 0.0 | 4.0 | 4.0 | 4.0 | 5.0 | 5.0 | 6.0 | 1965.0 | 1.0 | 8.0 | 0.0 | 1.0 | 0.0 | 2.0 | 2.0 | 1.0 | 0.0 | 3.0 | 830.0 | 1.0 | 1256.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1256.0 | 3.0 | 1.0 | 6.0 | 7.0 | 0.0 | 1.0 | 5.0 | 1965.0 | 3.0 | 1.0 | 276.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 6.0 | 2008.0 | 0.0 | 4.0 | 3592.0 | 43.0 | 43.0 | 0.0 | 3.0 | 3.0 | 43.0 | 1.0 |
1460 rows × 61 columns
from sklearn.model_selection import RepeatedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
cv = RepeatedKFold(random_state=1991)
dt = DecisionTreeRegressor()
%%time
dt.fit(x_train_filled, y_train)
Wall time: 44 ms
DecisionTreeRegressor()
dt.get_depth()
23
dt.get_n_leaves()
1404
%%time
fig, ax = plt.subplots(1,1, figsize=(18,18))
plot_tree(dt, ax=ax, filled=True, feature_names=list(x_train_filled.columns), max_depth=2, fontsize=10)
plt.show()
Wall time: 1.11 s
from sklearn.metrics import mean_absolute_error, mean_squared_error
param_dict = {
"criterion": ["mse", "friedman_mse", "poisson"],
"splitter" : ["best", "random"]
}
dt_gscv = GridSearchCV(
DecisionTreeRegressor(),
param_dict,
scoring = "neg_mean_absolute_error",
n_jobs = -2,
cv = cv,
verbose = 1,
)
dt_gscv.fit(x_train_filled, y_train)
Fitting 50 folds for each of 6 candidates, totalling 300 fits
GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=1991),
estimator=DecisionTreeRegressor(), n_jobs=-2,
param_grid={'criterion': ['mse', 'friedman_mse', 'poisson'],
'splitter': ['best', 'random']},
scoring='neg_mean_absolute_error', verbose=1)
pd.DataFrame(dt_gscv.cv_results_).sort_values("rank_test_score").head(7)
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_criterion | param_splitter | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | split10_test_score | split11_test_score | split12_test_score | split13_test_score | split14_test_score | split15_test_score | split16_test_score | split17_test_score | split18_test_score | split19_test_score | split20_test_score | split21_test_score | split22_test_score | split23_test_score | split24_test_score | split25_test_score | split26_test_score | split27_test_score | split28_test_score | split29_test_score | split30_test_score | split31_test_score | split32_test_score | split33_test_score | split34_test_score | split35_test_score | split36_test_score | split37_test_score | split38_test_score | split39_test_score | split40_test_score | split41_test_score | split42_test_score | split43_test_score | split44_test_score | split45_test_score | split46_test_score | split47_test_score | split48_test_score | split49_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 0.081292 | 0.013399 | 0.008676 | 0.005428 | friedman_mse | best | {'criterion': 'friedman_mse', 'splitter': 'best'} | -25596.496575 | -26228.191781 | -24750.578767 | -21386.801370 | -23029.616438 | -24946.465753 | -21219.547945 | -27084.671233 | -26131.099315 | -25369.917808 | -24040.958904 | -25480.633562 | -27050.989726 | -26152.448630 | -29563.910959 | -21941.773973 | -23979.818493 | -27542.325342 | -21926.171233 | -29399.287671 | -25601.270548 | -24503.123288 | -22689.315068 | -26149.989726 | -26783.232877 | -26340.958904 | -23894.496575 | -25843.643836 | -22701.219178 | -23553.417808 | -25828.219178 | -22355.914384 | -23789.708904 | -28244.883562 | -23674.006849 | -22000.397260 | -25777.568493 | -24142.698630 | -26798.969178 | -26156.582192 | -26943.448630 | -23118.602740 | -22696.595890 | -27276.424658 | -25093.178082 | -25769.445205 | -22634.763699 | -25271.363014 | -26632.753425 | -23510.664384 | -24971.971233 | 1994.554069 | 1 |
| 0 | 0.070118 | 0.011782 | 0.006198 | 0.001183 | mse | best | {'criterion': 'mse', 'splitter': 'best'} | -25798.558219 | -26966.664384 | -23342.856164 | -22138.119863 | -23010.565068 | -26742.705479 | -21692.753425 | -26411.434932 | -26124.071918 | -25209.113014 | -25233.883562 | -24578.373288 | -25393.020548 | -26432.715753 | -28980.486301 | -20985.537671 | -23978.743151 | -25769.260274 | -21595.674658 | -27069.294521 | -24723.705479 | -24927.794521 | -23168.688356 | -25532.554795 | -26788.561644 | -26619.202055 | -24634.140411 | -26437.219178 | -21735.184932 | -23149.222603 | -25892.106164 | -24085.352740 | -25579.356164 | -28028.681507 | -23716.034247 | -22585.178082 | -26041.647260 | -24528.452055 | -26497.989726 | -26989.006849 | -26451.924658 | -23466.119863 | -22719.020548 | -26010.842466 | -27411.592466 | -25981.900685 | -24227.486301 | -24784.301370 | -27870.982877 | -23594.640411 | -25032.654452 | 1837.416567 | 2 |
| 1 | 0.048391 | 0.010072 | 0.008317 | 0.006001 | mse | random | {'criterion': 'mse', 'splitter': 'random'} | -23991.068493 | -24867.958904 | -28433.236301 | -23161.239726 | -26519.969178 | -26594.239726 | -26938.462329 | -27090.681507 | -30462.061644 | -23729.821918 | -21862.308219 | -25963.243151 | -25860.335616 | -25523.332192 | -28919.359589 | -23077.136986 | -25613.722603 | -28600.205479 | -24291.544521 | -29937.739726 | -29452.455479 | -25077.397260 | -27594.791096 | -26179.308219 | -25288.205479 | -24888.825342 | -24472.030822 | -27762.856164 | -24236.616438 | -22652.424658 | -28950.993151 | -24003.989726 | -25598.291096 | -25943.797945 | -26706.897260 | -24751.921233 | -28387.743151 | -27661.517123 | -26412.034247 | -24262.195205 | -27053.037671 | -23280.568493 | -26230.386986 | -26739.684932 | -27662.428082 | -28208.729452 | -24533.852740 | -25996.092466 | -28011.900685 | -25349.791096 | -26095.768630 | 1965.416229 | 3 |
| 3 | 0.038317 | 0.006224 | 0.005977 | 0.001122 | friedman_mse | random | {'criterion': 'friedman_mse', 'splitter': 'ran... | -28363.339041 | -28577.092466 | -25013.770548 | -25747.188356 | -23531.541096 | -27831.845890 | -26818.585616 | -25635.585616 | -26066.767123 | -24987.191781 | -24497.092466 | -24738.904110 | -28022.934932 | -28659.589041 | -28590.386986 | -26528.760274 | -28706.739726 | -26407.260274 | -25895.280822 | -30212.369863 | -29209.565068 | -27493.421233 | -25174.445205 | -25036.054795 | -25521.082192 | -27182.106164 | -25998.386986 | -32944.328767 | -26112.880137 | -28153.441781 | -27298.075342 | -27188.767123 | -27761.993151 | -24556.547945 | -25719.054795 | -23613.941781 | -24561.921233 | -25817.667808 | -27792.465753 | -23962.671233 | -25220.376712 | -26162.243151 | -26950.763699 | -25004.232877 | -29245.023973 | -27017.773973 | -23240.434932 | -26890.390411 | -32966.229452 | -24821.811644 | -26669.006507 | 2068.841298 | 4 |
| 4 | 0.520461 | 0.055808 | 0.010374 | 0.007151 | poisson | best | {'criterion': 'poisson', 'splitter': 'best'} | -31813.089041 | -33973.811644 | -32470.582192 | -28726.047945 | -29997.284247 | -32807.106164 | -27213.232877 | -33049.027397 | -31597.482877 | -28405.458904 | -30495.157534 | -30128.065068 | -32676.184932 | -29774.363014 | -29797.448630 | -26960.842466 | -29626.318493 | -35234.934932 | -31087.893836 | -35005.359589 | -32841.349315 | -32115.934932 | -28373.071918 | -34103.130137 | -30064.202055 | -31626.068493 | -30081.945205 | -31940.602740 | -31780.602740 | -32741.606164 | -32553.705479 | -31195.160959 | -31039.390411 | -33582.047945 | -29235.821918 | -34898.143836 | -34147.164384 | -30666.705479 | -28589.866438 | -31167.750000 | -30273.469178 | -28664.034247 | -30890.174658 | -28234.571918 | -32448.270548 | -32810.972603 | -26736.345890 | -29299.263699 | -33266.767123 | -31623.904110 | -31156.634726 | 2103.208791 | 5 |
| 5 | 0.135421 | 0.034292 | 0.011115 | 0.009981 | poisson | random | {'criterion': 'poisson', 'splitter': 'random'} | -29997.017123 | -36757.352740 | -33702.369863 | -31894.684932 | -29612.068493 | -29854.845890 | -29342.434932 | -32541.243151 | -32300.010274 | -29458.804795 | -27668.726027 | -28973.465753 | -38672.708904 | -31265.102740 | -30126.301370 | -30746.407534 | -28400.551370 | -31619.462329 | -30280.589041 | -37267.746575 | -33960.023973 | -31598.561644 | -28798.626712 | -32673.315068 | -30039.517123 | -30049.441781 | -32709.212329 | -29477.219178 | -34741.517123 | -31366.003425 | -37669.191781 | -28561.876712 | -32957.955479 | -32625.582192 | -31828.424658 | -33016.106164 | -34261.660959 | -30401.863014 | -29621.907534 | -29759.294521 | -36889.106164 | -28561.390411 | -30415.688356 | -30466.900685 | -29996.547945 | -33537.825342 | -26730.277397 | -31128.808219 | -33299.095890 | -35878.986301 | -31670.076438 | 2699.023919 | 6 |
%%time
from sklearn.ensemble import RandomForestRegressor
param_dict = {
"n_estimators": [100],
"n_jobs" : [-2],
"verbose" : [1],
"random_state": [2451],
"warm_start" : [1],
"max_depth" : [28, 30, 32, None]
}
rf_gscv = GridSearchCV(
RandomForestRegressor(),
param_dict,
scoring = "neg_mean_absolute_error",
n_jobs = -2,
cv = cv,
verbose = 1,
)
rf_gscv.fit(x_train_filled, y_train)
Fitting 50 folds for each of 4 candidates, totalling 200 fits
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers. [Parallel(n_jobs=-2)]: Done 44 tasks | elapsed: 0.7s
Wall time: 3min 41s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed: 1.7s finished
GridSearchCV(cv=RepeatedKFold(n_repeats=10, n_splits=5, random_state=1991),
estimator=RandomForestRegressor(), n_jobs=-2,
param_grid={'max_depth': [28, 30, 32, None], 'n_estimators': [100],
'n_jobs': [-2], 'random_state': [2451], 'verbose': [1],
'warm_start': [1]},
scoring='neg_mean_absolute_error', verbose=1)
pd.DataFrame(rf_gscv.cv_results_).sort_values("rank_test_score")
| mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_n_estimators | param_n_jobs | param_random_state | param_verbose | param_warm_start | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | split10_test_score | split11_test_score | split12_test_score | split13_test_score | split14_test_score | split15_test_score | split16_test_score | split17_test_score | split18_test_score | split19_test_score | split20_test_score | split21_test_score | split22_test_score | split23_test_score | split24_test_score | split25_test_score | split26_test_score | split27_test_score | split28_test_score | split29_test_score | split30_test_score | split31_test_score | split32_test_score | split33_test_score | split34_test_score | split35_test_score | split36_test_score | split37_test_score | split38_test_score | split39_test_score | split40_test_score | split41_test_score | split42_test_score | split43_test_score | split44_test_score | split45_test_score | split46_test_score | split47_test_score | split48_test_score | split49_test_score | mean_test_score | std_test_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2.974518 | 0.555691 | 0.686110 | 0.354883 | 30 | 100 | -2 | 2451 | 1 | 1 | {'max_depth': 30, 'n_estimators': 100, 'n_jobs... | -17118.919555 | -18301.884795 | -16229.615719 | -13816.366575 | -16305.166336 | -16194.932226 | -15034.980719 | -17437.878425 | -17225.881473 | -16777.914521 | -15405.935308 | -14813.179932 | -16942.395479 | -17165.310479 | -18015.859829 | -15634.316644 | -14466.744829 | -17510.523562 | -15805.898082 | -19062.055137 | -17529.008116 | -16989.013253 | -16126.862432 | -16070.603733 | -16418.366507 | -14035.150308 | -15717.828493 | -18194.263082 | -15996.656768 | -18351.096986 | -18987.040103 | -15363.865616 | -15291.128801 | -17500.780685 | -15964.114521 | -15710.296404 | -17996.886027 | -15279.345514 | -16633.697397 | -17180.374281 | -18027.059795 | -14073.454486 | -15700.996884 | -15972.589486 | -17921.478322 | -15763.231678 | -15242.289658 | -17941.351678 | -18674.470856 | -15465.282055 | -16507.686871 | 1309.028207 | 1 |
| 3 | 2.692679 | 0.344106 | 0.345503 | 0.289754 | None | 100 | -2 | 2451 | 1 | 1 | {'max_depth': None, 'n_estimators': 100, 'n_jo... | -17118.919555 | -18301.884795 | -16229.615719 | -13816.366575 | -16305.166336 | -16194.932226 | -15034.980719 | -17437.878425 | -17225.881473 | -16777.914521 | -15405.935308 | -14813.179932 | -16942.395479 | -17165.310479 | -18015.859829 | -15634.316644 | -14466.744829 | -17510.523562 | -15805.898082 | -19062.055137 | -17529.008116 | -16989.013253 | -16126.862432 | -16070.603733 | -16418.366507 | -14035.150308 | -15717.828493 | -18181.486336 | -16011.168356 | -18351.096986 | -18987.040103 | -15363.865616 | -15291.128801 | -17500.780685 | -15964.114521 | -15710.296404 | -17996.886027 | -15279.345514 | -16633.697397 | -17180.374281 | -18027.059795 | -14073.454486 | -15700.996884 | -15972.589486 | -17921.478322 | -15763.231678 | -15242.289658 | -17941.351678 | -18674.470856 | -15465.282055 | -16507.721568 | 1308.588450 | 2 |
| 2 | 2.763852 | 0.437150 | 0.503961 | 0.293557 | 32 | 100 | -2 | 2451 | 1 | 1 | {'max_depth': 32, 'n_estimators': 100, 'n_jobs... | -17118.919555 | -18301.884795 | -16229.615719 | -13816.366575 | -16305.166336 | -16194.932226 | -15034.980719 | -17437.878425 | -17225.881473 | -16777.914521 | -15405.935308 | -14813.179932 | -16942.395479 | -17165.310479 | -18015.859829 | -15634.316644 | -14466.744829 | -17510.523562 | -15805.898082 | -19062.055137 | -17529.008116 | -16989.013253 | -16126.862432 | -16070.603733 | -16418.366507 | -14035.150308 | -15717.828493 | -18191.948664 | -16011.168356 | -18351.096986 | -18987.040103 | -15363.865616 | -15291.128801 | -17500.780685 | -15964.114521 | -15710.296404 | -17996.886027 | -15279.345514 | -16633.697397 | -17180.374281 | -18027.059795 | -14073.454486 | -15700.996884 | -15972.589486 | -17921.478322 | -15763.231678 | -15242.289658 | -17941.351678 | -18674.470856 | -15465.282055 | -16507.930814 | 1308.856881 | 3 |
| 0 | 2.743051 | 0.378772 | 0.384700 | 0.318033 | 28 | 100 | -2 | 2451 | 1 | 1 | {'max_depth': 28, 'n_estimators': 100, 'n_jobs... | -17118.919555 | -18301.884795 | -16229.615719 | -13816.366575 | -16305.166336 | -16194.932226 | -15034.980719 | -17437.878425 | -17225.881473 | -16777.914521 | -15405.935308 | -14813.179932 | -16942.395479 | -17165.310479 | -18015.859829 | -15634.316644 | -14466.744829 | -17510.523562 | -15795.806975 | -19062.055137 | -17529.008116 | -16989.013253 | -16126.862432 | -16070.603733 | -16418.366507 | -14061.151826 | -15717.828493 | -18192.260279 | -16004.376336 | -18351.096986 | -18987.040103 | -15363.865616 | -15293.964836 | -17500.780685 | -15964.114521 | -15710.296404 | -17996.886027 | -15279.345514 | -16633.697397 | -17180.374281 | -18027.059795 | -14073.454486 | -15700.996884 | -15972.589486 | -17921.478322 | -15763.231678 | -15241.610154 | -17941.351678 | -18678.842363 | -15465.282055 | -16508.249975 | 1308.153644 | 4 |
pd.Series(rf_gscv.best_estimator_.feature_importances_, index=x_train_filled.columns).sort_values(ascending=False).round(4) * 100
Total_SF 62.69 OverallQual 12.68 Neighborhood 7.76 GrLivArea 1.82 BsmtFinSF1 1.15 BsmtQual 0.83 KitchenQual 0.82 LotArea 0.79 MasVnrArea 0.74 Age_At_Sale 0.68 TotalBsmtSF 0.65 YearRemodAdd 0.56 OverallCond 0.55 GarageArea 0.55 LotFrontage 0.54 Remodel_Age 0.54 ExterQual 0.48 GarageYrBlt 0.42 All_bath 0.40 Age_Garage 0.37 TotRmsAbvGrd 0.35 MoSold 0.32 FireplaceQu 0.30 GarageCars 0.27 BsmtExposure 0.26 MSSubClass 0.26 LandContour 0.25 Exterior2nd 0.21 SaleCondition 0.21 GarageType 0.21 BsmtFinType1 0.20 Exterior1st 0.19 Fireplaces 0.18 CentralAir 0.17 YrSold 0.15 MasVnrType 0.13 GarageFinish 0.13 BedroomAbvGr 0.11 ScreenPorch 0.10 LotShape 0.10 MSZoning 0.09 LotConfig 0.08 HouseStyle 0.08 SaleType 0.07 HeatingQC 0.07 RoofStyle 0.07 Functional 0.07 BsmtFinType2 0.04 Foundation 0.04 Remodel_Done 0.04 Condition1 0.04 GarageQual 0.03 Floors_no 0.03 PavedDrive 0.03 BldgType 0.03 3SsnPorch 0.02 GarageCond 0.02 Road 0.01 LowQualFinSF 0.01 Electrical 0.01 MiscVal 0.01 dtype: float64
%%time
rf_ml = RandomForestRegressor(**rf_gscv.best_params_)
rf_ml.fit(x_train_filled, y_train)
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers. [Parallel(n_jobs=-2)]: Done 44 tasks | elapsed: 0.7s
Wall time: 2.01 s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed: 1.8s finished
RandomForestRegressor(max_depth=30, n_jobs=-2, random_state=2451, verbose=1,
warm_start=1)
for _ in range(4):
rf_ml.n_estimators += 100
rf_ml.random_state += 101
rf_ml.fit(x_train_filled, y_train)
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers. [Parallel(n_jobs=-2)]: Done 44 tasks | elapsed: 0.8s [Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed: 1.9s finished [Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers. [Parallel(n_jobs=-2)]: Done 44 tasks | elapsed: 0.8s [Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed: 1.8s finished [Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers. [Parallel(n_jobs=-2)]: Done 44 tasks | elapsed: 1.0s [Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed: 2.1s finished [Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 3 concurrent workers. [Parallel(n_jobs=-2)]: Done 44 tasks | elapsed: 0.8s [Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed: 2.0s finished